import pandas as pd
import re

"""
This script combines our own codes with the codes assigned by a peer.

@author: Jonathan van Oudheusden
@date: 2024-01-28

Required files:
    - OwnCodes.xlsx
    with columns "rand_id", "activity_experience_slot", "activity_experience_mod_slot" and "My codes"
    - PeerCodes.xlsx
    with columns "ID", "Quotation Name", "Quotation Content" and "Codes"
    
Output files:
    - combinedCodes.xlsx
"""

# Input workbooks; the paths are relative to the current working directory.
fileMyCodes = 'OwnCodes.xlsx'
filePeerCodes = 'PeerCodes.xlsx'

# Load the first sheet of each workbook into a DataFrame.
dfMyCodes = pd.read_excel(fileMyCodes)
dfPeerCodes = pd.read_excel(filePeerCodes)


def compare_strings_ignore_spaces_and_linebreaks(str1, str2):
    """Tell whether two texts consist of the same sequence of letters.

    Only alphabetic characters take part in the comparison: whitespace, line
    breaks, digits and punctuation are all discarded first. Case is
    significant. Two texts that contain no letters at all compare equal.

    Args:
        str1: First text; may be a null cell value (None/NaN).
        str2: Second text; may be a null cell value (None/NaN).

    Returns:
        False when either value is null, otherwise True exactly when both
        texts contain the same letters in the same order.
    """
    if pd.isnull(str1) or pd.isnull(str2):
        return False
    # Cells that hold a number arrive as int/float rather than str; convert
    # them to text so the letter filter works instead of raising a TypeError.
    str1_processed = ''.join(filter(str.isalpha, str(str1)))
    str2_processed = ''.join(filter(str.isalpha, str(str2)))

    # Compare the letter-only versions of both texts.
    return str1_processed == str2_processed

def combineModAndRow(myRow, myModSlot):
    """Join the modified-slot text and the slot text into one value.

    Args:
        myRow: Value of the slot cell; may be null (None/NaN).
        myModSlot: Value of the modified-slot cell; may be null (None/NaN).

    Returns:
        ``myModSlot + myRow`` when both are present, the one that is present
        when only one is, and ``myRow`` unchanged (i.e. null) when neither is.
    """
    # Without a modified slot there is nothing to prepend.
    if pd.isnull(myModSlot):
        return myRow
    # Only the modified slot is filled in.
    if pd.isnull(myRow):
        return myModSlot
    # Both are filled in: the modified slot goes first, no separator.
    return myModSlot + myRow

# Peer columns that play no part in the matching are thrown away.
drop_columns = ['Quotation Name']
dfPeerCodes.drop(columns=drop_columns, inplace=True)

# Keep only peer rows with real text: blank / "none" entries and purely
# numerical answers are empty in my own codes and can't be matched.
peerContent = dfPeerCodes['Quotation Content'].astype(str).str.strip()
isBlankContent = peerContent.str.lower().isin(['none', ''])
isNumericContent = peerContent.str.isdigit()
dfPeerCodesFiltered = dfPeerCodes[~isBlankContent & ~isNumericContent]

# Every row of my own codes starts out without a peer code.
dfMyCodes['PeerCodes'] = ''

# For each peer row, look for the row in my own codes that holds the same
# free-text response and store the peer's codes there.
for peer_index, peer_row in dfPeerCodesFiltered.iterrows():
    found_match = False
    peerText = peer_row['Quotation Content']
    for my_index, my_row in dfMyCodes.iterrows():
        # The peer data has the modified slot and the slot glued together,
        # so build the same combination before comparing.
        ModAndRow = combineModAndRow(
            my_row['activity_experience_slot'],
            my_row['activity_experience_mod_slot'],
        )
        mycodes = my_row['My codes']

        # Skip rows whose text differs from the peer quotation.
        if not compare_strings_ignore_spaces_and_linebreaks(peerText, ModAndRow):
            continue
        # Skip rows that I did not code myself.
        if pd.isnull(mycodes) or mycodes.strip() == '':
            continue
        # Skip rows that already received a peer code: identical texts occur
        # more than once, so keep looking for the next free row.
        if not (pd.isnull(my_row['PeerCodes']) or my_row['PeerCodes'] == ''):
            continue

        dfMyCodes.at[my_index, 'PeerCodes'] = peer_row['Codes']
        found_match = True
        break
    if not found_match:
        print(f"No match found for Quotation Content: {peer_row['Quotation Content']} ID {peer_row['ID']} ")


# From here on the same DataFrame (an alias, not a copy) is used under a new name.
dfAllCodes = dfMyCodes

def splitPeerCodes(otherCodes):
    """Split a multi-line cell of peer codes into the individual codes.

    The codes are separated by line breaks; whitespace directly around a line
    break belongs to the separator.

    Args:
        otherCodes: Cell value holding one code per line. May be null
            (None/NaN); other non-string values are converted to text.

    Returns:
        List of code strings in their original order, without blank entries.
        The list is empty when the cell is null or holds only whitespace.
    """
    # A null cell holds no codes; re.split would raise a TypeError on it.
    if pd.isnull(otherCodes):
        return []
    otherCodesSplit = re.split(r'\s*\n\s*', str(otherCodes))
    # A leading or trailing line break produces an empty fragment; drop those
    # so callers do not end up with dangling separators after joining.
    return [code for code in otherCodesSplit if code.strip()]

# Tidy up the PeerCodes column of every row that I coded myself.
for index, row in dfAllCodes.iterrows():
    mycodes = row['My codes']
    peerCodes = row['PeerCodes']

    # Rows without a code of my own are left untouched.
    if pd.isnull(mycodes) or mycodes.strip() == '':
        continue

    # A null peer code (e.g. an empty "Codes" cell in the peer file) counts as
    # missing, just like the empty-string placeholder, so it is reported here
    # instead of being split into codes.
    if pd.isnull(peerCodes) or peerCodes == '':
        if mycodes.strip() == "Empty":
            # if my codes has the code 'Empty', then fill in Empty in the peer code
            dfAllCodes.at[index, 'PeerCodes'] = "Empty"
        else:
            usertext = combineModAndRow(row['activity_experience_slot'], row['activity_experience_mod_slot'])
            print(f"Peer code is missing for entry. {usertext}: {mycodes}")
    else:
        # Turn the one-code-per-line cell into a comma-separated list.
        peerCodesCommasep = ', '.join([code.strip() for code in splitPeerCodes(peerCodes)])
        dfAllCodes.at[index, 'PeerCodes'] = peerCodesCommasep


# Give every row a participant id: a row without a rand_id inherits the id of
# the closest preceding row that has one ("" while no id has been seen yet).
currentPID = ""
for index, row in dfAllCodes.iterrows():
    PID = row['rand_id']
    if pd.isnull(PID):
        dfAllCodes.at[index, 'rand_id'] = currentPID
        continue
    # A row with its own id starts a new participant.
    currentPID = PID


# Write the combined table to the output workbook.
dfAllCodes.to_excel('combinedCodes.xlsx')